Objectives: The goal of this kernel is to analyze the effect and flavor of 3 different types of Cannabis.
EDA includes datatable, skim, 3D plotly, DrillDown and NLP.
If you have any question, please leave a comment and if you like the kernel, please give me an upvote~ Thanks!
library(tidyverse)
library(skimr)
library(highcharter)
library(tm)
library(plotly)
library(viridis)
library(wordcloud)
library(plotrix)
library(DescTools)
library(DT)
library(ggraph)
library(igraph)
library(ggthemes)
library(visNetwork)
library(tidytext)set.seed(1)
weed <- read_csv("../data/cannabis.csv")# weed %>% skim() %>% kable()
weed %>% skim() | Name | Piped data |
| Number of rows | 2351 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Strain | 0 | 1.00 | 2 | 30 | 0 | 2350 | 0 |
| Type | 0 | 1.00 | 6 | 6 | 0 | 3 | 0 |
| Effects | 0 | 1.00 | 4 | 46 | 0 | 1655 | 0 |
| Flavor | 46 | 0.98 | 3 | 30 | 0 | 1293 | 0 |
| Description | 33 | 0.99 | 4 | 1120 | 0 | 2312 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Rating | 0 | 1 | 4.31 | 0.84 | 0 | 4.2 | 4.4 | 4.7 | 5 | ▁▁▁▁▇ |
weed %>%
select(-Description) %>%
datatable(filter = 'top', options = list(
pageLength = 30, autoWidth = TRUE
))weed %>% glimpse()## Rows: 2,351
## Columns: 6
## $ Strain <chr> "100-Og", "98-White-Widow", "1024", "13-Dawgs", "24K-Gold"~
## $ Type <chr> "hybrid", "hybrid", "sativa", "hybrid", "hybrid", "indica"~
## $ Rating <dbl> 4.0, 4.7, 4.4, 4.2, 4.6, 0.0, 4.4, 4.2, 4.6, 4.4, 4.7, 4.5~
## $ Effects <chr> "Creative,Energetic,Tingly,Euphoric,Relaxed", "Relaxed,Aro~
## $ Flavor <chr> "Earthy,Sweet,Citrus", "Flowery,Violet,Diesel", "Spicy/Her~
## $ Description <chr> "$100 OG is a 50/50 hybrid strain that packs a strong punc~
weed$Rating %>% PlotFdist("Cannabis Ranking Distribution")Ranking by Types
type <- weed$Type %>% unique()
for (i in 1:length(type)){
weed[weed$Type == type[i],]$Rating %>%
PlotFdist(paste("Cannabis Ranking Distribution - ", toupper(type[i])))
}hcboxplot(weed$Rating, weed$Type, color = 'firebrick') %>%
hc_add_theme(hc_theme_economist()) %>%
hc_chart(type = 'bar') %>%
hc_title(text = "Cannbis Type Ranking Boxplot")by_type <- weed %>%
count(Type)
hchart(by_type, type = 'treemap', hcaes(x = 'Type', value = 'n', color = 'n'))rm(by_type)weed_effects <- weed %>%
mutate(Effects = str_split(Effects,',')) %>%
unnest(Effects)
weed_effects %>%
count(Effects) %>%
hchart(type = 'treemap', hcaes(x = 'Effects', value = 'n', color = 'n'))weed_flavor <- weed %>%
filter(Flavor != 'none') %>%
mutate(Flavor = str_split(Flavor, ',')) %>%
unnest(Flavor)
weed_flavor %>%
count(Flavor) %>%
hchart(type = 'treemap', hcaes(x = 'Flavor', value = 'n', color = 'n'))filtered_weed_effects <- weed_effects %>% filter(Effects != "None")
hcboxplot(filtered_weed_effects $Rating, filtered_weed_effects $Effects, color = 'firebrick') %>%
hc_add_theme(hc_theme_economist()) %>%
hc_chart(type = 'bar') %>%
hc_title(text = "Cannbis Effects Ranking Boxplot")filtered_weed_flavor <- weed_flavor %>% filter(Flavor != "None")
hcboxplot(filtered_weed_flavor$Rating, filtered_weed_flavor$Flavor, color = 'firebrick') %>%
hc_add_theme(hc_theme_economist()) %>%
hc_chart(type = 'bar') %>%
hc_title(text = "Cannbis Flavor Ranking Boxplot")Click the part to Drill Down!
df1 <- weed %>%
group_by(name = Type, drilldown = Type) %>%
summarise(y = n()) %>%
arrange(desc(y))
df2 <- weed_effects %>%
group_by(Type, Effects) %>%
mutate(y = n()
# , colorByPoint = 1
) %>%
arrange(desc(y)) %>%
group_by(name = Type, id = Type
# , colorByPoint
) %>%
do(data = list_parse(
mutate(.,name = Effects, drilldown = tolower(paste(Type, Effects,sep=": "))) %>%
group_by(name, drilldown) %>%
summarise(y=n()) %>%
select(name, y, drilldown) %>%
arrange(desc(y))))
a <- highchart() %>%
hc_chart(type = 'pie') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Type of Cannbis vs Effects Pie Chart") %>%
hc_add_theme(hc_theme_darkunica())
b <- highchart() %>%
hc_chart(type = 'bar') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Type of Cannbis vs Effects Bar Chart") %>%
hc_add_theme(hc_theme_darkunica())
rm(df1, df2)
lst <- list(
a,
b
)
hw_grid(lst, rowheight = 400)rm(a, b, lst)Click the part to Drill Down!
df1 <- weed %>%
group_by(name = Type, drilldown = Type) %>%
summarise(y = n()) %>%
arrange(desc(y))
df2 <- weed_flavor %>%
group_by(Type, Flavor) %>%
mutate(y = n(), colorByPoint = 1) %>%
arrange(desc(y)) %>%
group_by(name = Type, id = Type
# , colorByPoint
) %>%
do(data = list_parse(
mutate(., name = Flavor, drilldown = tolower(paste(Type, Flavor,sep=": "))) %>%
group_by(name, drilldown) %>%
summarise(y=n()) %>%
select(name, y, drilldown) %>%
arrange(desc(y))))
a <- highchart() %>%
hc_chart(type = 'pie') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of cannabis', data = df1
# , colorByPoint = 1
) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Type of Cannbis vs Flavor Pie Chart") %>%
hc_add_theme(hc_theme_darkunica())
b <- highchart() %>%
hc_chart(type = 'bar') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Type of Cannbis vs Flavor Bar Chart") %>%
hc_add_theme(hc_theme_darkunica())
rm(df1, df2)
lst <- list(
a,
b
)
hw_grid(lst, rowheight = 400)rm(a, b, lst)# clean corpus
cleanCorpus <- function(corpus){
corpus.tmp <- tm_map(corpus, removePunctuation)
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace)
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower))
v_stopwords <- c(stopwords("en"), c("thats","weve","hes","theres","ive","im","strain",
"will","can","cant","dont","youve","us",
"youre","youll","theyre","whats","didnt"))
corpus.tmp <- tm_map(corpus.tmp, removeWords, v_stopwords)
corpus.tmp <- tm_map(corpus.tmp, removeNumbers)
return(corpus.tmp)
}# frequent terms
frequentTerms <- function(text){
s.cor <- Corpus(VectorSource(text))
s.cor.cl <- cleanCorpus(s.cor)
s.tdm <- TermDocumentMatrix(s.cor.cl)
s.tdm <- removeSparseTerms(s.tdm, 0.999)
m <- as.matrix(s.tdm)
word_freqs <- sort(rowSums(m), decreasing=TRUE)
dm <- data.frame(word=names(word_freqs), freq=word_freqs)
return(dm)
}# clean by each Type
clean_top_char <- function(dataset){
all_dialogue <- list()
namelist <- list()
for (i in 1:3){
top <- dataset %>% count(Type) %>% arrange(desc(n)) %>% head(20)
name <- top$Type[i]
Description <- paste(dataset$Description[dataset$Type == name], collapse = " ")
all_dialogue <- c(all_dialogue, Description)
namelist <- c(namelist, name)
}
all_clean <- all_dialogue %>%
VectorSource() %>%
Corpus() %>%
cleanCorpus() %>%
TermDocumentMatrix() %>%
as.matrix()
colnames(all_clean) <- namelist
assign("all_clean",all_clean,.GlobalEnv)
all_clean %>% head()
}
weed %>% clean_top_char()## Docs
## Terms hybrid indica sativa
## abandon 1 0 0
## abate 8 5 5
## abates 1 1 0
## abating 3 1 0
## abbreviated 1 0 0
## abduct 1 0 0
weed$Description %>%
frequentTerms() %>%
# dim()
head(30) %>%
mutate(word = factor(word))%>%
plot_ly(x = ~reorder(word,-freq), y = ~freq, colors = viridis(10)) %>%
add_bars(color = ~word) %>%
layout(title = "Top 30 Words",
yaxis = list(title = " "),
xaxis = list(title = ""),
margin = list(l = 100))df <- all_clean %>%
as.data.frame() %>%
rownames_to_column() %>%
gather(type, value, -1)
df1 <- df %>%
group_by(name = type, drilldown = type) %>%
summarise(y = sum(value)) %>%
arrange(desc(y))
df2 <- df %>%
group_by(type, rowname) %>%
summarise(total = sum(value)) %>%
arrange(desc(total)) %>%
group_by(name = type, id = type) %>%
do(data = list_parse(
mutate(., name = rowname, drilldown = tolower(paste(type, rowname, sep=": "))) %>%
group_by(name, drilldown) %>%
summarise(y = sum(total)) %>%
select(name, y, drilldown) %>%
arrange(desc(y))) %>%
head(30))
highchart() %>%
hc_chart(type = 'column') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of words in comments', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Top 30 Words by Type of Cannbis") %>%
hc_add_theme(hc_theme_darkunica())rm(df2)df2 <- df %>%
inner_join(get_sentiments("loughran"), by = c('rowname' = 'word')) %>%
group_by(type,sentiment) %>%
summarise(total = sum(value)) %>%
arrange(desc(total)) %>%
group_by(name = type, id = type) %>%
do(data = list_parse(
mutate(., name = sentiment, drilldown = tolower(paste(type, sentiment, sep = ": "))) %>%
group_by(name, drilldown) %>%
summarise(y = sum(total)) %>%
select(name, y , drilldown) %>%
arrange(desc(y))
))
highchart() %>%
hc_chart(type = 'column') %>%
hc_xAxis(type = "category") %>%
hc_add_series(name = 'number of words in comments', data = df1, colorByPoint = 1) %>%
hc_drilldown(
allowPointDrilldown = TRUE,
series =list_parse(df2)
) %>%
hc_legend(enabled = F) %>%
hc_title(text = "Sentiment Analysis by Type of Cannbis") %>%
hc_add_theme(hc_theme_darkunica())rm(df2)Sativa vs Indica
commonality.cloud(all_clean[,c("sativa","indica")], colors = "steelblue1", at.least = 2, max.words = 100)Sativa vs Indica
comparison.cloud(all_clean[,c("sativa","indica")], colors = c("#F8766D", "#00BFC4"), max.words=50)Sativa vs Indica
common_words <- all_clean %>%
as.data.frame() %>%
rownames_to_column() %>%
filter(sativa>0, indica>0) %>%
# select(sativa, indica)
mutate(difference = abs(sativa - indica)) %>%
arrange(desc(difference))
common_words_25 <- common_words%>%
head(25)
pyramid.plot(common_words_25$sativa, common_words_25$indica,
labels = common_words_25$rowname, gap = 200,
top.labels = c("sativa", "Words", "indica"),
main = "Words in Common", laxlab = NULL,
raxlab = NULL, unit = NULL)## 517 517
## [1] 5.1 4.1 4.1 2.1
# rm(common_words, common_words_25)effects <- weed_effects$Effects %>% unique() %>% tolower()
rm(weed_effects)
effectByType <- all_clean %>%
as.data.frame() %>%
rownames_to_column('word') %>%
filter(word %in% effects) %>%
mutate(word=factor(word))
effectByType %>%
plot_ly(x=~hybrid,y=~sativa,z= ~indica, hoverinfo = 'text', colors = viridis(15),
text = ~paste('Effects:', word
,'<br>hybrid:', hybrid,
'<br>sativa:', sativa,
'<br>indica:', indica
)) %>%
add_markers(opacity = 0.8) %>%
layout(title = "Effects by Different Cannabis",
annotations=list(yref='paper',xref="paper",y=1.05,x=1.1, text="Effects",showarrow=F),
scene = list(xaxis = list(title = 'hybrid'),
yaxis = list(title = 'sativa'),
zaxis = list(title = 'indica')))effectByType %>%
mutate(type = 'hybrid') %>%
select(word, type, hybrid) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = hybrid), show.legend = FALSE) +
geom_node_point(color = "firebrick", size = 20, alpha = .5) +
geom_node_text(aes(label = name), col = "white") +
theme_solarized(light = F)network3_edg <- effectByType %>%
mutate(type = 'hybrid') %>%
select(word, type, hybrid) %>%
filter(hybrid>0) %>%
rename(from = type, to = word, weight = hybrid, width = hybrid)
network3_node <- effectByType %>%
mutate(type = 'hybrid') %>%
filter(hybrid>0) %>%
select(word, hybrid) %>%
rename(id = word, size = hybrid)
network3_node$label <- network3_node$id # Node label
network3_node <- bind_rows(network3_node,data.frame(id = 'hybrid', size = 24, label = 'hybrid'))
visNetwork(network3_node, network3_edg, height = "500px", width = "100%") %>%
# visIgraphLayout(layout = "layout_with_lgl") %>%
visEdges(shadow = TRUE,
color = list(color = "gray", highlight = "orange"))network3_edg <- effectByType %>%
mutate(type = 'hybrid') %>%
select(word, type, hybrid) %>%
filter(hybrid>0) %>%
rename(from = type, to = word, weight = hybrid, width = hybrid)
network3_node <- effectByType %>%
mutate(type = 'hybrid') %>%
filter(hybrid>0) %>%
select(word, hybrid) %>%
rename(id = word, size = hybrid)
network3_node$label <- network3_node$id # Node label
network3_node <- bind_rows(network3_node,data.frame(id = 'hybrid', size = 24, label = 'hybrid'))
visNetwork(network3_node, network3_edg, height = "500px", width = "100%") %>%
visIgraphLayout(layout = "layout_with_lgl") %>%
visEdges(shadow = TRUE,
color = list(color = "gray", highlight = "orange"))network3_edg <- effectByType %>%
mutate(type = 'hybrid') %>%
select(word, type, hybrid) %>%
filter(hybrid>0) %>%
rename(from = type, to = word)
network3_node <- effectByType %>%
mutate(type = 'hybrid') %>%
filter(hybrid>0) %>%
select(word) %>%
rename(id = word)
network3_node$label <- network3_node$id # Node label
network3_node <- bind_rows(network3_node,data.frame(id = 'hybrid', label = 'hybrid'))
visNetwork(network3_node, network3_edg, height = "500px", width = "100%") %>%
# visIgraphLayout(layout = "layout_with_lgl") %>%
visEdges(shadow = TRUE,
color = list(color = "gray", highlight = "orange"))Hope you enjoyed the cannabis analysis